## Predict on Experimental Data ##


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from scipy.stats.stats import pearsonr

import sys
sys.path.append('./classifier/')
import data_loader
import vectorizer
# import best_model

import pickle
import pandas as pd
import re
import os
import random
import numpy as np
# Fixed random seed (42) for all experiments. It seeds Python's `random` and
# NumPy's global generator here, and is also passed as `random_state` to the
# classifiers built further down in this file.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes and not the hash
# randomisation of this process -- confirm.
seed_value= 42
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)


def get_prob_narr(test_fnames):
    """
    Returns a dictionary mapping test-filenames to a probability-narrative.

    A random forest is trained on the reader-annotated data (loaded with
    threshold=2.5) and then applied to `test_fnames`. The feature set and the
    tree depth are selected by the module-level `model` name.

    Args:
        test_fnames: filenames to score; handed to the vectorizer together
            with the training filenames.

    Returns:
        dict mapping each test filename to the predicted probability of the
        'POS' class.

    Raises:
        ValueError: if `model` is not a supported name, or if 'POS' is not
            among the classes the forest was fitted on.
        AssertionError: if a hard prediction disagrees with its probability.
    """
    # model name -> (name of the feature extractor in `vectorizer`, max_depth).
    # The extractor is stored by name and resolved only after validation, so
    # only the extractor that is actually selected has to exist.
    configs = {
        'within-401': ('all_feature_categories_uni', 20),  # the best pos-TMV parameters
        'within-401-pos-tense': ('pos_tense', 5),
        'within-401-pos-mood': ('pos_mood', 5),
        'within-401-pos-tmv-quoted': ('pos_tmv_quoted', 20),
    }
    if model not in configs:
        # Fail early with a clear message instead of hitting an unbound local
        # further down when no configuration matches.
        raise ValueError(
            f"Unknown model {model!r}; expected one of: {', '.join(sorted(configs))}"
        )
    extractor_name, max_depth = configs[model]

    # Every supported model is trained on the Reader-Annotated data.
    train_fnames, Y = data_loader.load_annotated_data(threshold=2.5)
    print("Using Annotated-Data only..", len(Y))
    extract_features = getattr(vectorizer, extractor_name)
    X_train, X_test = extract_features(train_fnames, test_fnames)
    algo = RandomForestClassifier(n_estimators=500, max_depth=max_depth, random_state=seed_value)

    print("Train files:", len(train_fnames), X_train.shape, len(Y), "| Test files:", len(test_fnames), X_test.shape)

    algo.fit(X_train, Y)

    pred_probs = algo.predict_proba(X_test)
    preds = algo.predict(X_test)

    # Columns of predict_proba follow algo.classes_; look the 'POS' column up
    # instead of assuming it is always the second one.
    classes = algo.classes_.tolist()
    if 'POS' not in classes:
        raise ValueError(f"'POS' is not among the fitted classes: {classes}")
    pos_col = classes.index('POS')

    map_fname_probnarr = {}
    for fname, probs, pred in zip(test_fnames, pred_probs, preds):
        prob_narr = probs[pos_col]
        # Sanity check: the hard label must agree with the probability.
        # Raised explicitly so the check is not stripped under `python -O`.
        expected = 'POS' if prob_narr > 0.5 else 'NEG'
        if pred != expected:
            raise AssertionError(
                f"Prediction {pred!r} for {fname!r} contradicts P(POS)={prob_narr}"
            )
        map_fname_probnarr[fname] = prob_narr

    print("Ordering:", classes, "| Predictions:", len(pred_probs), len(preds))
    return map_fname_probnarr


def main(folder_name):
    """
    Scores every file of `folder_name` and saves the narrative-probabilities
    to a TSV.

    The filenames are the keys of the pickled feature dictionary for
    `folder_name`; the output file is named after the module-level `model`
    and `folder_name`. Both paths are built under the module-level root `p`.
    """
    pickle_path = f"{p}pickles/{folder_name}_tense_mood_voice_features_lite.pickle"
    with open(pickle_path, 'rb') as pickle_file:
        features_by_fname = pickle.load(pickle_file)  # created via pickle_features.py

    # Only the filenames (keys) of the pickled dictionary are used here.
    test_fnames = [*features_by_fname.keys()]
    map_fname_probnarr = get_prob_narr(test_fnames)

    print("\n\nWrite to TSV..")
    tsv_path = f"{p}new-results/experimental-data/{model}__{folder_name}.tsv"
    with open(tsv_path, 'w') as tsv_file:
        # Header row first, then one "<filename>\t<probability>" row per file.
        tsv_file.write('Filename\tProbability-Narrative\n')
        tsv_file.writelines(
            fname + '\t' + str(prob) + '\n'
            for fname, prob in map_fname_probnarr.items()
        )

            
if __name__ == '__main__':
    # Root of the project directory. main() builds the pickle and TSV paths by
    # plain string concatenation, so the trailing slash is required.
    p = '/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/'
    # Model name read by get_prob_narr() and used in the output filename.
    # Supported by get_prob_narr(): 'within-401', 'within-401-pos-tense',
    # 'within-401-pos-mood' and 'within-401-pos-tmv-quoted'.
    # NOTE(review): 'trained-12k' used to be listed here as an option, but
    # get_prob_narr() in this file has no configuration for it.
    model = 'within-401-pos-tmv-quoted'
    
    # Alternative corpora (un-comment to score them instead):
#     main('SCIENCE-JSTOR')
#     main('POETRY')
    main('SCIENCE-ROYAL')
